#' ---
#' title: "Text as Behavior: Asian American Partisanship"
#' date: "`r Sys.Date()`"
#' output: html_document
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#'  - \setcounter{totalnumber}{1}
#' ---


## ---- aap_spin_setup, eval = FALSE, include = FALSE ----
# spin code to output Rmd / Rnw
# set knit = FALSE to generate Rmd / Rnw but not compile

# rmarkdown::render(input = here::here("text_code", "text_asian_american_partisanship.R"), output_format = "pdf_document", clean = FALSE)

# ********************************************************************************************************
# * Study Title: Social Exclusion and Political Identity: The Case of Asian American Partisanship
# * Replication File for Experimental Study
# * Date: May 9, 2016
# * Written By: Alexander Kuo, Neil Malhotra, and Cecilia Hyunjung Mo
# * Dataset: experimentaldata_kmm_050916.dta
# ********************************************************************************************************
# 
# use experimentaldata_kmm_050916.dta, clear

## ---- aap_packages, include = FALSE ----
source(here::here("text_code/text_packages_rep.R"))
source(here::here("text_code/custom_ggplot_themes.R"))

## ---- aap_load_data, include = FALSE ----

# Raw experimental data (Stata .dta file under text_data_raw/).
aap <- read_dta(
  here::here("text_data_raw", "experimentaldata_kmm_050916_final.dta")
)

## ---- aap_setup2, include = FALSE ----

# Project table helpers; their contents are not visible from this file.
source(here::here("text_code", "custom_table_functions_rep.R"))

# Output format flag -- presumably read by the table helpers; TODO confirm.
star_format <- "latex"

# Global knitr chunk options: figure height, graphics devices, resolution.
knitr::opts_chunk$set(fig.height = 4, dev = c("png", "pdf"), dpi = 288)

# functions

# alternative to Wilcoxon is rank transformation plus inverse normal cdf transformation
# https://statmodeling.stat.columbia.edu/2015/07/13/dont-do-the-wilcoxon/
rank_inverse_normal <- function(x) {
  # Rank-based inverse-normal transform of a numeric vector.
  # NA entries keep their position and come back as NA; ties receive the
  # average rank (the rank() default).
  ranks      <- rank(x, na.last = "keep")
  n_observed <- sum(!is.na(x))
  # The 0.5 offset keeps every probability strictly inside (0, 1), so qnorm()
  # never returns +/-Inf for the smallest or largest observation.
  qnorm((ranks - 0.5) / n_observed)
}

row_not_zero <- function(x) {
  # For each row of a matrix / data frame, count the entries that differ
  # from zero. A row containing NA yields NA (sum() without na.rm).
  count_nonzero <- function(row_values) {
    sum(row_values != 0)
  }
  apply(x, 1, count_nonzero)
}

row_zero <- function(x) {
  # For each row of a matrix / data frame, count the entries equal to zero.
  # A row containing NA yields NA (sum() without na.rm).
  count_zero <- function(row_values) {
    sum(row_values == 0)
  }
  apply(x, 1, count_zero)
}

## ---- plot_themes ----

# plot theme
# Both calls change global plotting defaults, so they must run before any
# figure is drawn. theme_set() is presumably ggplot2's (loaded via
# text_packages_rep.R -- confirm).
theme_set(theme_minimal(base_size = 13))
# NOTE(review): set_theme() is not defined in this file -- presumably sjPlot's
# theme setter, given it receives the same base theme; confirm in
# text_packages_rep.R.
set_theme(base = theme_minimal(base_size = 13))


## ---- aap_clean_data, include = FALSE ----

  
# Basic cleaning in a single chain: readable names for the survey metadata
# columns, completion-time measures, and treatment indicators.
aap <- aap %>%
  rename(
    responseid = v1, startdate = v8, enddate = v9, finished = v10,
    assigned_id = q11, agreed_to_participate = q12
  ) %>%
  mutate(
    # startdate / enddate are split on the first space into a date piece and
    # a clock-time piece.
    date_start = str_split(startdate, " ", simplify = TRUE)[, 1],
    time_start = str_split(startdate, " ", simplify = TRUE)[, 2],
    date_end   = str_split(enddate, " ", simplify = TRUE)[, 1],
    time_end   = str_split(enddate, " ", simplify = TRUE)[, 2],
    # NOTE(review): only the clock times are subtracted; date_start/date_end
    # are ignored, so a session that crosses midnight gets a negative duration.
    duration  = hm(time_end) - hm(time_start),
    # Elapsed time in seconds, then in minutes.
    time      = as.numeric(as.duration(duration)),
    time_min  = time / 60,
    time_norm = rank_inverse_normal(time),
    # 1 when the session took more than 1800 seconds.
    writers_block = as.numeric(time > 1800)
    # (An earlier Stata-style draft derived the same timing from hand-parsed
    # hour and minute substrings of startdate / enddate.)
  ) %>%
  mutate(
    # Treatment assignment as a 0/1 factor and as a labelled factor; any
    # treatmentran value other than "T" / "C" becomes NA.
    treatment_cit = as.factor(case_when(
      treatmentran == "T" ~ 1,
      treatmentran == "C" ~ 0
    )),
    treatment_fct = as.factor(case_when(
      treatmentran == "T" ~ "Treated",
      treatmentran == "C" ~ "Control"
    ))
  )


# **************************
# ****** DEMOGRAPHICS ******
# **************************

# Pattern used to recognise Asian respondents from the free-text ethnicity
# field (eth_text).
asian_regex <- "Asian|Indian|Pakistan|Islander"

# Demographic recodes: gender, age, education, ethnicity, and the
# ethnicity-by-treatment condition factors used later in the analysis.
aap <- aap %>%
  mutate(
    # *** Gender (Female = 1); any other q21 code stays NA.
    female = case_when(
      q21 == 1 ~ 1,
      q21 == 2 ~ 0
    ),

    yborn = 1900 + q22,
    years = 2013 - (yborn + 1),
    age   = (years - 16) / 39,
    ed    = (q23 - 2) / 4,

    # Education dummies. case_when() returns the FIRST matching branch, so the
    # specific category has to be tested before the `!is.na(q23) ~ 0`
    # catch-all. With the catch-all first (as before) every dummy was 0 for
    # all respondents. Missing q23 stays NA.
    hs = case_when(
      q23 == 2    ~ 1,
      !is.na(q23) ~ 0
    ),

    somecol = case_when(
      q23 == 3    ~ 1,
      !is.na(q23) ~ 0
    ),

    col = case_when(
      q23 == 5    ~ 1,
      !is.na(q23) ~ 0
    ),

    post = case_when(
      q23 == 6    ~ 1,
      !is.na(q23) ~ 0
    ),

    ethnicity = case_when(
      q249 == 1 ~ "White",
      q249 == 2 ~ "Black",
      q249 == 3 ~ "Hispanic/Latino",
      q249 == 4 ~ "Asian/Asian American",
      q249 == 5 ~ "Other"
    ),

    eth_text = ethnicitywhiteblackhispaniclatin,

    # Asian = 1 from the closed-ended item only; missing q249 is coded 0.
    asian = case_when(
      q249 == 4 ~ 1,
      TRUE ~ 0
    ),

    # NOTE(review): unlike `asian`, a missing q249 gives NA here, whereas the
    # Stata reference below recodes missing to 0 -- confirm which is intended.
    white = ifelse(q249 == 1, 1, 0),

    white_fct = case_when(
      q249 == 1 ~ "White",
      TRUE      ~ "Nonwhite"
    ) %>% fct_relevel(c("White", "Nonwhite")),

    # Asian (1) vs. white (0); everyone else NA.
    asian_white = case_when(
      asian == 1 ~ 1,
      white == 1 ~ 0,
      TRUE ~ NA_real_
    ),

    # Same contrast, but also counting respondents whose free-text ethnicity
    # matches asian_regex as Asian.
    asian_white1 = case_when(
      q249 == 4 | str_detect(eth_text, asian_regex) == 1 ~ 1,
      white == 1 ~ 0,
      TRUE ~ NA_real_
    ) %>% as.factor(),

    asian_white_other_cond = case_when(
      ethnicity == "Asian/Asian American" & treatment_cit == 1 ~ "Asian Treated",
      ethnicity == "Asian/Asian American" & treatment_cit == 0 ~ "Asian Control",
      ethnicity == "White" & treatment_cit == 1 ~ "White Treated",
      ethnicity == "White" & treatment_cit == 0 ~ "White Control",
      treatment_cit == 1 ~ "Other Treated",
      treatment_cit == 0 ~ "Other Control"
    ) %>% fct_relevel(rev(c("Asian Treated", "Asian Control", "Other Treated", "Other Control", "White Treated", "White Control"))),

    asian_white_cond = case_when(
      ethnicity == "Asian/Asian American" & treatment_cit == 1 ~ "Asian Treated",
      ethnicity == "Asian/Asian American" & treatment_cit == 0 ~ "Asian Control",
      ethnicity == "White" & treatment_cit == 1 ~ "White Treated",
      ethnicity == "White" & treatment_cit == 0 ~ "White Control",
      TRUE ~ NA_character_
    ) %>% fct_relevel(c("White Control", "White Treated", "Asian Control", "Asian Treated")),

    asian_white_fct = case_when(
      ethnicity == "Asian/Asian American" ~ "Asian",
      ethnicity == "White" ~ "White",
      TRUE ~ NA_character_
    ) %>% fct_relevel(c("White", "Asian")),

    # (Earlier drafts of asian_white_other_cond / asian_white_cond also used
    # the asian_regex match on eth_text, as asian_cond below still does.)

    white_cond = case_when(
      q249 == 1 & treatment_cit == 1 ~ "White Treated",
      q249 == 1 & treatment_cit == 0 ~ "White Control",
      q249 >  1 & treatment_cit == 1 ~ "Nonwhite Treated",
      q249 >  1 & treatment_cit == 0 ~ "Nonwhite Control"
    ) %>% fct_relevel(c("White Control", "White Treated", "Nonwhite Control", "Nonwhite Treated")),

    nonwhite_cond = case_when(
      q249 > 1 & treatment_cit == 1 ~ "Nonwhite Treated",
      q249 > 1 & treatment_cit == 0 ~ "Nonwhite Control",
      TRUE ~ NA_character_
    ) %>% fct_relevel(c("Nonwhite Treated", "Nonwhite Control")),

    asian_cond = case_when(
      (q249 == 4 | str_detect(eth_text, asian_regex) == 1) &
        treatment_cit == 1 ~ "Asian Treated",
      (q249 == 4 | str_detect(eth_text, asian_regex) == 1) &
        treatment_cit == 0 ~ "Asian Control",
      TRUE ~ NA_character_
    ) %>% fct_relevel(c("Asian Control", "Asian Treated")),

    other_cond = case_when(
      asian_white_other_cond == "Other Treated" ~ "Other Treated",
      asian_white_other_cond == "Other Control" ~ "Other Control",
      TRUE ~ NA_character_
    ) %>% fct_relevel(rev(c("Other Treated", "Other Control"))),

    # Stata reference from the original replication file:
    # ends(ethnicitywhiteblackhispaniclatin), punct(" ")
    #   g asian = 1 if q249 == 4
    #   recode asian (.=1) if ethnicity_check == "Asian" | ethnicity_check == "Asian-" | ethnicity_check == "Asian/Mixed" | ethnicity_check == "Indian"
    #   recode asian (.=1) if q249_text == "Filipino" | q249_text == "Hispanic/ Asian" | q249_text == "Vietnamese Iranian"
    #   recode asian (.=0)
    #   label var asian "Asian"
    #   g white = 1 if q249 == 1
    #   recode white (.=0)
    #   label var white "White"
    #   move asian q249
    #   move white q249

    # Index (1-29) of the first q250_* box that is checked; NA when none is.
    asian_ethnicity = case_when(
      q250_1  == 1 ~ 1,
      q250_2  == 1 ~ 2,
      q250_3  == 1 ~ 3,
      q250_4  == 1 ~ 4,
      q250_5  == 1 ~ 5,
      q250_6  == 1 ~ 6,
      q250_7  == 1 ~ 7,
      q250_8  == 1 ~ 8,
      q250_9  == 1 ~ 9,
      q250_10 == 1 ~ 10,
      q250_11 == 1 ~ 11,
      q250_12 == 1 ~ 12,
      q250_13 == 1 ~ 13,
      q250_14 == 1 ~ 14,
      q250_15 == 1 ~ 15,
      q250_16 == 1 ~ 16,
      q250_17 == 1 ~ 17,
      q250_18 == 1 ~ 18,
      q250_19 == 1 ~ 19,
      q250_20 == 1 ~ 20,
      q250_21 == 1 ~ 21,
      q250_22 == 1 ~ 22,
      q250_23 == 1 ~ 23,
      q250_24 == 1 ~ 24,
      q250_25 == 1 ~ 25,
      q250_26 == 1 ~ 26,
      q250_27 == 1 ~ 27,
      q250_28 == 1 ~ 28,
      q250_29 == 1 ~ 29
    ) %>% as.factor()

    # Stata reference:
    # * restrict data to Asians and whites
    # gen group1 = (q249==4|q249==1)
    # gen asiant = asian==1 if asian==1|q249==1
    # gen asiant_treat = asiant*treatment_cit
  )


# test <- aap %>% 
#   mutate_at(vars(contains("text")), list(length = str_length) ) %>% 
#   mutate(nchar_sum = select(., contains("length")) %>% rowSums() )

# "q219" "q221" "q223" "q225" "q249" "q250" "q261"


zeroize <- function(x) {
  # Penalise non-response and shift the result so its minimum is exactly 0
  # (for count models).
  #
  # x: numeric vector; an entry of 0 is treated as non-response.
  # Returns a numeric vector of the same length. NA entries stay NA and no
  # longer turn the whole result into NA; an empty or all-NA input is
  # returned unchanged.
  if (all(is.na(x))) {
    return(x)
  }
  # Non-responders are pushed below zero by 10% of the observed maximum.
  penalty   <- round(max(x, na.rm = TRUE) * 0.10)
  penalized <- x - ifelse(x == 0, penalty, 0)
  # Subtract the minimum rather than adding abs(min): the two agree when the
  # minimum is <= 0, but only subtraction yields min = 0 when every value is
  # positive. Ending on an expression (not an assignment) also makes the
  # result visible when the function is called interactively.
  penalized - min(penalized, na.rm = TRUE)
}

str_any <- function(x) {
  # 1 when the string has at least one character, 0 when it is empty;
  # NA input gives NA.
  as.numeric(str_length(x) > 0)
}

# For every free-text column add `<col>_length` (number of characters) and
# `<col>_count` (1 if anything was written), then total the lengths by
# question.
aap <- aap %>%
  # Both helper columns are created in ONE mutate_at() call. Running a second
  # mutate_at(vars(contains("text"))) would also select the freshly created
  # `*_text_length` columns and add `*_text_length_count` columns that are 1
  # for every non-missing value, which the length patterns below then matched.
  mutate_at(
    vars(contains("text")),
    list(length = str_length, count = str_any)
  ) %>%
  mutate(
    # Patterns are anchored with `$` so only real `*_length` columns are
    # summed.
    # dem likes
    nchar_219_sum = select(., matches("q219.*length$")) %>% rowSums(),
    # dem dislikes
    nchar_221_sum = select(., matches("q221.*length$")) %>% rowSums(),
    # rep likes
    nchar_223_sum = select(., matches("q223.*length$")) %>% rowSums(),
    # rep dislikes
    nchar_225_sum = select(., matches("q225.*length$")) %>% rowSums(),
    # empty (originally ethnicity?)
    nchar_249_sum = select(., matches("q249.*length$")) %>% rowSums(),
    # mostly empty
    nchar_250_sum = select(., matches("q250.*length$")) %>% rowSums(),
    # name politician test
    nchar_261_sum = select(., matches("q261.*length$")) %>% rowSums(),

    nchar_eth_sum = select(., matches("eth_.*length$")) %>% rowSums()
  )


# Derived text-volume measures built from the per-question character totals.
aap <- aap %>%
  mutate(
    # Characters written across the five writing tasks (q219, q221, q223,
    # q225, q261); the ethnicity and mostly-empty fields are excluded.
    nchar_sum = rowSums(
      select(., matches("nchar_219_sum|nchar_221_sum|nchar_223_sum|nchar_225_sum|nchar_261_sum"))
    ),

    # Number of non-empty text fields across the same five tasks.
    nchar_count = rowSums(
      select(., matches("q219_.*_text_count|q221_.*_text_count|q223_.*_text_count|q225_.*_text_count|q261_.*_text_count"))
    ),
    nchar_sum_norm = rank_inverse_normal(nchar_sum),

    # likes = positive in-group text minus negative in-group text
    nchar_demlikes = nchar_219_sum - nchar_221_sum,
    nchar_replikes = nchar_223_sum - nchar_225_sum,
    nchar_netlikes = nchar_demlikes - nchar_replikes,

    # affect = positive in-group text plus negative out-group text
    nchar_demaffect = nchar_219_sum + nchar_225_sum,
    nchar_repaffect = nchar_223_sum + nchar_221_sum,
    nchar_netaffect = nchar_demaffect - nchar_repaffect,

    # Non-response-penalised versions (see zeroize()). Author's note: there
    # don't appear to be any non-responders in the affect questions, so these
    # two may be droppable.
    nchar_demaffect_zero = zeroize(nchar_219_sum) + zeroize(nchar_225_sum),
    nchar_repaffect_zero = zeroize(nchar_223_sum) + zeroize(nchar_221_sum),

    # NEED TO THINK MORE ABOUT WEIGHTING OF ZERO / RESCALING
    nchar_netaffect_zero = zeroize(nchar_netaffect),
    nchar_netlikes_zero  = zeroize(nchar_netlikes),

    # Characters per minute over the whole session.
    nchar_rate = nchar_sum / time_min,

    # Per-second rates (`time` is in seconds) for likes ...
    nchar_demlikes_time = nchar_demlikes / time,
    nchar_replikes_time = nchar_replikes / time,

    # ... and for affect.
    nchar_demaffect_time = nchar_demaffect / time,
    nchar_repaffect_time = nchar_repaffect / time
  )


# count number of entries with nchar > 2 in a vector
row_count <- function(x) {
  # Number of entries of x whose value exceeds 2 (x is compared directly;
  # no nchar() is applied). Any NA in x makes the result NA.
  sum(x > 2)
}

# Number of q261 (name-the-politician task) text fields with any content.
aap <- aap %>%
  mutate(
    # Same column pattern as the q261 part of nchar_count. The looser
    # "q261.*count" also matches `q261_*_text_length_count` columns (count
    # flags computed on the length columns), which inflates the total.
    q261_count = select(., matches("q261_.*_text_count$")) %>% rowSums()
  ) %>%
  ungroup()

#aap %>% head() %>% View()


# nchar_df <- aap %>%
#   select(contains("text")) %>% 
#   mutate_all(.funs = nchar
#   ) 
# 
# nchar_261 <- nchar_df %>% 
#   select(contains("q261_")) %>% 
#   mutate(
#     nchar_261 = rowSums(.)
#   )
# 
# nchar_df <- nchar_df %>% 
#   mutate(
#     nchar_sum = rowSums(.)
#   )
# 
# aap <- bind_cols(aap, nchar_261 %>% select(nchar_261))
# aap <- bind_cols(aap, nchar_df  %>% select(nchar_sum))



# aap <- aap %>% 
#   mutate(
#     nchar_261_bin      = case_when(nchar_261 == 0 ~ 0, TRUE ~ 1),
#     nchar_tot_bin      = case_when(nchar_tot == 0 ~ 0, TRUE ~ 1),
#     nchar_261_rescaled = case_when(nchar_261 != 0 ~ nchar_261 + 25, TRUE ~ nchar_261),
#     nchar_tot_rescaled = case_when(nchar_tot != 0 ~ nchar_tot + 25, TRUE ~ nchar_tot)
#   )


# Party identification, ideology, citizenship and religion recodes.
aap <- aap %>%
  mutate(
    partyid = q227 %>% as.factor(),

    # Follow-up items: code 2 becomes 0, everything else is kept as recorded.
    strongrep = case_when(
      q228 == 2 ~ 0,
      TRUE      ~ q228
    ),
    strongdem = case_when(
      q229 == 2 ~ 0,
      TRUE      ~ q229
    ),
    liberal_leaning_rep = case_when(
      q230 == 2 ~ 0,
      TRUE      ~ q230
    ),

    # Six-point party scale; branches are tried in order, so the strongdem
    # answer takes precedence, then the leaner item, then strongrep.
    pID = case_when(
      strongdem           == 1 ~ 1,
      strongdem           == 0 ~ 2,
      liberal_leaning_rep == 0 ~ 3,
      liberal_leaning_rep == 1 ~ 4,
      strongrep           == 0 ~ 5,
      strongrep           == 1 ~ 6
    ),
    dem = (pID < 4) %>% as.factor(),
    # Rescaled so pID 1 maps to 1 and pID 6 maps to 0.
    pid   = (6 - pID) / 5,
    ideo  = (7 - q231) / 6,
    ideo2 = 8 - q231,

    citizen = 5 - q254,
    us = case_when(
      citizen == 1 ~ 0,
      citizen >  1 ~ 1
    ),
    citizen2 = (citizen >= 2) %>% as.numeric(),

    relig_fct = q257,
    # 1 for any answer other than code 8, 0 for code 8, NA when missing
    # (code 8 is presumably "no religion" -- confirm against the codebook).
    # case_when() returns the first matching branch, so the specific test
    # must precede the `!is.na(q257) ~ 0` catch-all; in the reverse order the
    # variable was 0 for every respondent.
    relig_bin = case_when(
      q257 != 8    ~ 1,
      !is.na(q257) ~ 0
    )
  )

  

# test
# row_not_zero(c(1,0,0,0,0,0,0:5))  

# Outcome measures (v1-v8) and the pro-Democratic Party index.
aap <- aap %>%
  mutate(
    # *** Republican: Close-minded
    rep1         = (q29_1 - 1) / 3,
    # *** Democrat: Close-minded
    dem1         = (q210_1 - 1) / 3,
    # *** Republican: Ignorant
    rep9         = (q29_11 - 1) / 3,
    # *** Democrat: Ignorant
    dem9         = (q210_11 - 1) / 3,
    # *** Republican Party Represents Interest of people like yourself
    frep1        = (4 - q211) / 3,
    # *** Democratic Party Represents Interest of people like yourself
    fdem1        = (4 - q212) / 3,
    # *** Republican Party Thermometer
    frep2        = q213_1 / 89,
    # *** Democratic Party Thermometer
    fdem2        = (q214_1 - 5) / 91,
    # *** Net Closed Minded
    v1           = ((rep1 - dem1) - 1) / (-2),
    # *** Net Ignorant
    v2           = ((rep9 - dem9) - 1) / (-2),

    # *** Net Likes: number of non-empty like / dislike fields per party.
    # Patterns are anchored with `$` so only genuine `*_length` columns are
    # counted; an unanchored "q219.*length" also matches any
    # `*_length_count` column, which is non-zero for every respondent and
    # would inflate each tally.
    dem_likes    = select(., matches("q219.*length$")) %>% row_not_zero(),
    dem_dislikes = select(., matches("q221.*length$")) %>% row_not_zero(),
    rep_likes    = select(., matches("q223.*length$")) %>% row_not_zero(),
    rep_dislikes = select(., matches("q225.*length$")) %>% row_not_zero(),
    rep_netlikes = rep_likes - rep_dislikes,
    dem_netlikes = dem_likes - dem_dislikes,
    netlikes     = dem_netlikes - rep_netlikes,
    # Non-response weighting is deliberately ignored for now
    # (previously sketched as: + ifelse(netlikes == 0, -2, 0)).
    # Rescale to start at zero: subtracting the minimum equals adding
    # abs(min) whenever the minimum is <= 0, and still yields 0 if it is not.
    netlikes2    = netlikes - min(netlikes),
    v3           = (netlikes + 11) / 28,

    # *** PID
    v4           = pid,
    # *** Thermometer
    v5           = ((fdem2 - frep2) + 1) / 2,
    # *** Represent
    v6           = ((fdem1 - frep1) + 1) / 2,
    # *** Elected-officials list task: number of non-empty q261 fields
    # (Stata reference: egen timetask = rownonmiss(q261*), strok)
    timetask     = select(., matches("q261.*length$")) %>% row_not_zero(),
    ln_timetask  = log(timetask + 1),
    v7           = ln_timetask,
    ln_time      = log(time + 1),
    ln_timetaken = log(q262_3),
    v8           = ln_timetaken,

    # *** Pro-Democratic Party Index: mean of v1-v6
    study2_avg   = (v1 + v2 + v3 + v4 + v5 + v6) / 6

    # Stata reference:
    # gen rep_netlikes = rep_likes-rep_dislikes
    # gen dem_netlikes = dem_likes-dem_dislikes
    # gen netlikes = dem_netlikes-rep_netlikes
    # gen v3 = netlikes
    # replace v3 = (v3+11)/28
    # gen ln_timetask = log(timetask+1)
    # gen v7 = ln_timetask
    # gen ln_timetaken= log(q262_3)
    # gen v8 = ln_timetaken
  )




## --- save-updated-data ---

# Write the cleaned data frame `aap` to text_data_output/aap_processed.Rdata
# (restore with load(); the object comes back under the name `aap`).
# NOTE(review): the "## ---" marker above has three dashes, so knitr will not
# treat it as a chunk header; this call runs as part of the preceding chunk.
save(aap, file = here("text_data_output", "aap_processed.Rdata"))

